{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')  # append this repo to PYTHONPATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "import math\n",
    "\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode \n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ingesting channel_islands_tnc and channel_islands_tnc_private"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# both animal and human entries are in this label file\n",
    "path_to_image_cct = '/Users/siyuyang/Data/CameraTraps/CCT_JSONs/channel_islands_camera_traps.json'\n",
    "path_to_bbox_cct = path_to_image_cct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Separate out human and non-human images"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some images have \"empty\" among the species labels but are in fact not empty and has other species labels. All \"empty\" bbox annotations also have a set of coordinates that covers\n",
    " the entire image - these need to be deleted. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_image_cct) as f:\n",
    "    cct_json = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "264321"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cct_json['annotations'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "114894"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Delete the bounding box in all annotations with \"category_id\": 0 (empty).\n",
    "num_deleted = 0\n",
    "for anno in cct_json['annotations']:\n",
    "    if anno['category_id'] == 0 and 'bbox' in anno:\n",
    "        num_deleted += 1\n",
    "        del anno['bbox']\n",
    "        \n",
    "num_deleted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading image DB...\n",
      "Number of items from the image DB: 245529\n",
      "Number of images with more than 1 species: 14808 (6.03% of image DB)\n",
      "Loading bbox DB...\n",
      "Number of images added from bbox DB entries:  0\n",
      "Number of images amended:  0\n",
      "Number of items in total:  245529\n",
      "Number of images with more than one bounding box: 14808 (6.031059467517076% of all entries)\n",
      "CPU times: user 19 s, sys: 152 ms, total: 19.1 s\n",
      "Wall time: 19.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "embedded = make_cct_embedded(image_db=cct_json, bbox_db=cct_json)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because we did not have a separate bbox CCT json with the bbox coarse categories, we map the species e.g. rodent, fox, etc to the four bbox categories ('animal', 'person', 'vehicle', 'group').\n",
    "\n",
    "Also get rid of the 'id' field in the image entries. Also for each image, keep 'species' (will be 'class' later) a list of unique species."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "for entry in embedded:\n",
    "    del entry['id']\n",
    "    \n",
    "    entry['annotations']['species'] = list(set(entry['annotations']['species']))\n",
    "    \n",
    "    for box in entry['annotations']['bbox']:\n",
    "        category = box['category']\n",
    "        assert category != 'empty'\n",
    "        \n",
    "        if category == 'human':\n",
    "            box['category'] = 'person'\n",
    "        else:\n",
    "            box['category'] = 'animal' # there were only 5 animal categories: fox, skunk, rodent, bird, other\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'file_name': 'loc-h500hh06211646/008/694.jpg',\n",
       "  'seq_id': 'b741c13f-8ca0-4705-b4d0-9bd6cff03eae',\n",
       "  'seq_num_frames': 20,\n",
       "  'frame_num': 4,\n",
       "  'original_relative_path': '2016_02_Set/RECONYX_N/100RECNX/IMG_8545.JPG',\n",
       "  'location': 'h500hh06211646',\n",
       "  'temperature': '20 c',\n",
       "  'annotations': {'species': ['rodent'],\n",
       "   'bbox': [{'category': 'animal', 'bbox_rel': [0.623, 0.546, 0.0698, 0.115]},\n",
       "    {'category': 'animal', 'bbox_rel': [0.859, 0.252, 0.0898, 0.186]}]}},\n",
       " {'file_name': 'loc-h600hi07237925/008/662.jpg',\n",
       "  'seq_id': '4b7d1d52-1d77-4556-a0b9-2db6f4cfa13b',\n",
       "  'seq_num_frames': 3,\n",
       "  'frame_num': 2,\n",
       "  'original_relative_path': 'Santa_Rosa/IMG_1537.JPG',\n",
       "  'location': 'h600hi07237925',\n",
       "  'temperature': '10 c',\n",
       "  'annotations': {'species': ['empty'], 'bbox': []}},\n",
       " {'file_name': 'loc-h500hh07215885/000/668.jpg',\n",
       "  'seq_id': '899d0680-a20c-48d0-b8c9-0ffa8b58454c',\n",
       "  'seq_num_frames': 2,\n",
       "  'frame_num': 0,\n",
       "  'original_relative_path': '2014_11B_RRSet3/NOV21_2014_RR_16_SD3/2014/2014-12-25/IMG_0395.JPG',\n",
       "  'location': 'h500hh07215885',\n",
       "  'temperature': '15 c',\n",
       "  'annotations': {'species': ['empty'], 'bbox': []}}]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(embedded, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5071"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "240458"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embedded_human = []\n",
    "embedded_others = []\n",
    "\n",
    "for entry in embedded:\n",
    "    if 'human' in entry['annotations']['species']:\n",
    "        embedded_human.append(entry)\n",
    "    else:\n",
    "        embedded_others.append(entry)\n",
    "        \n",
    "len(embedded_human) # 5071 in channel-island-private container\n",
    "len(embedded_others)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## channel_islands_tnc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'channel_islands_tnc'\n",
    "path_to_output = '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc.json' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to channel_islands_tnc. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 39%|███▉      | 93890/240458 [00:00<00:00, 469391.13it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 240458 images into sequences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 240458/240458 [00:00<00:00, 443589.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of sequences: 50309\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'class', 'file', 'location', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}\n",
      "\n",
      "img_level_properties\n",
      "{'class', 'file', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"channel_islands_tnc\", \"seq_id\": \"836f6487-50fd-42f5-8dcc-336fc538b7a8\", \"location\": \"h500ee07133376\", \"images\": [{\"frame_num\": 0, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0001.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/000.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 1, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0002.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/001.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 2, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0003.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/002.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 4, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0005.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/004.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 5, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0006.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/005.jpg\", \"class\": [\"empty\"], \"bbox\": []}]}\n",
      "\n",
      "{\"dataset\": \"channel_islands_tnc\", \"seq_id\": \"910d7e56-9740-47c2-bf58-3eae24a3fef3\", \"location\": \"h500hh07215685\", \"images\": [{\"frame_num\": 0, \"original_relative_path\": \"Ellie_2016-2017/SC20/SC200SE_20160706/IMG_0544.JPG\", \"temperature\": \"31 c\", \"file\": \"loc-h500hh07215685/001/781.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 1, \"original_relative_path\": \"Ellie_2016-2017/SC20/SC200SE_20160706/IMG_0545.JPG\", \"temperature\": \"31 c\", \"file\": \"loc-h500hh07215685/001/782.jpg\", \"class\": [\"empty\"], \"bbox\": []}, {\"frame_num\": 2, \"original_relative_path\": \"Ellie_2016-2017/SC20/SC200SE_20160706/IMG_0546.JPG\", \"temperature\": \"31 c\", \"file\": \"loc-h500hh07215685/001/783.jpg\", \"class\": [\"empty\"], \"bbox\": []}]}\n",
      "\n",
      "CPU times: user 8.79 s, sys: 109 ms, total: 8.9 s\n",
      "Wall time: 8.94 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences = process_sequences(embedded_others, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 27.2 s, sys: 96.8 ms, total: 27.3 s\n",
      "Wall time: 27.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## channel_islands_tnc_private"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'channel_islands_tnc_private'\n",
    "path_to_output = '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc_private.json' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5071/5071 [00:00<00:00, 332073.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to channel_islands_tnc_private. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n",
      "Putting 5071 images into sequences...\n",
      "Number of sequences: 768\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'class', 'file', 'location', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}\n",
      "\n",
      "img_level_properties\n",
      "{'class', 'file', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"channel_islands_tnc_private\", \"seq_id\": \"836f6487-50fd-42f5-8dcc-336fc538b7a8\", \"location\": \"h500ee07133376\", \"images\": [{\"frame_num\": 3, \"original_relative_path\": \"2011_09_Set/Station%201/2011/2011-09-13/IMG_0004.JPG\", \"temperature\": \"21 c\", \"file\": \"loc-h500ee07133376/000/003.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0, 0.608, 0.999, 0.362]}]}]}\n",
      "\n",
      "{\"dataset\": \"channel_islands_tnc_private\", \"seq_id\": \"27c13982-2d15-43b4-9d20-db7d7e001e85\", \"location\": \"h550hf07158799\", \"images\": [{\"frame_num\": 0, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0001.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/193.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.287, 0.216, 0.629, 0.752]}]}, {\"frame_num\": 1, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0002.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/194.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.288, 0.176, 0.633, 0.787]}]}, {\"frame_num\": 2, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0003.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/195.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.275, 0.125, 0.579, 0.828]}]}, {\"frame_num\": 3, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0004.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/196.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.283, 0.22, 0.56, 0.749]}]}, {\"frame_num\": 4, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0005.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/197.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.289, 0.235, 0.513, 0.737]}]}, {\"frame_num\": 5, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0006.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/198.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.289, 0.235, 0.527, 0.735]}]}, {\"frame_num\": 6, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0007.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/199.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.315, 0.245, 0.479, 0.726]}]}, {\"frame_num\": 7, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0008.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/200.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.278, 0.0703, 0.643, 0.899]}]}, {\"frame_num\": 8, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0009.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/201.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.0416, 0.128, 0.955, 0.843]}]}, {\"frame_num\": 9, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0010.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/202.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0.0177, 0.0287, 0.652, 0.941]}]}, {\"frame_num\": 10, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0011.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/203.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0, 0.0305, 0.407, 0.938]}]}, {\"frame_num\": 11, \"original_relative_path\": \"Ellie_2016-2017/SC08/SC08001_20161213/IMG_0012.JPG\", \"temperature\": \"19 c\", \"file\": \"loc-h550hf07158799/001/204.jpg\", \"class\": [\"human\"], \"bbox\": [{\"category\": \"person\", \"bbox\": [0, 0.025, 0.217, 0.803]}]}]}\n",
      "\n",
      "CPU times: user 309 ms, sys: 18.2 ms, total: 327 ms\n",
      "Wall time: 328 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences = process_sequences(embedded_human, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 796 ms, sys: 5.59 ms, total: 802 ms\n",
      "Wall time: 804 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
